#!pip install geopandas
import pandas as pd
import geopandas as gpd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer
from tqdm import tqdm
from textblob import Blobber
import os
import plotly.express as px
import json
import re
def import_data(listings_file, reviews_file, neighborhood_geojson):
    '''Load the listings CSV, reviews CSV and neighbourhood GeoJSON from the given paths.

    Returns a 3-tuple: (listings DataFrame, reviews DataFrame, neighbourhoods GeoDataFrame).
    '''
    listings = pd.read_csv(listings_file)
    reviews = pd.read_csv(reviews_file)
    neighborhoods = gpd.read_file(neighborhood_geojson)
    return listings, reviews, neighborhoods
def data_clean():
    '''Clean the three global frames in place.

    - listings_data['price']: strip the leading "$" and thousands separators
      (e.g. "$1,234.00" -> 1234.0) and convert to float.
    - reviews_data['reviewer_id']: cast to str so it can be used as a merge key.
    - neighborhood_geo_data: drop the unused 'neighbourhood_group' column.
    '''
    global neighborhood_geo_data, listings_data, reviews_data
    # FIX: the original used a per-row lambda with a redundant `"," in x` test,
    # which also raised TypeError on missing prices. The vectorized str.replace
    # (regex=False: plain substring removal) handles NaN transparently.
    listings_data['price'] = (
        listings_data['price']
        .str[1:]
        .str.replace(',', '', regex=False)
        .astype("float")
    )
    reviews_data['reviewer_id'] = reviews_data['reviewer_id'].astype("str")
    neighborhood_geo_data = neighborhood_geo_data.drop('neighbourhood_group', axis=1)
def plot_corr_price():
    '''Show a seaborn heatmap of correlations between price and key listing attributes.'''
    feature_cols = ['price','minimum_nights','accommodates', 'beds', 'bedrooms','host_total_listings_count','number_of_reviews']
    corr_matrix = listings_data[feature_cols].corr()
    # mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
    palette = sns.diverging_palette(230, 20, as_cmap=True)
    sns.heatmap(
        corr_matrix,
        cmap=palette,
        vmax=.3,
        center=0,
        square=True,
        linewidths=.5,
        cbar_kws={"shrink": .5},
    )
    plt.show()
def sentiment_list(nb_analyzer=False):
    '''
    Build a per-review sentiment score list from reviews_data['comments'].

    Parameters
    ----------
    nb_analyzer : bool
        If True, score each comment with TextBlob's NaiveBayesAnalyzer and
        record the positive-class probability (p_pos). Otherwise use the
        default pattern analyzer and record the subjectivity score.

    Returns
    -------
    list of float
        One score per row of reviews_data; missing comments get a neutral 0.5.
    '''
    scores = []
    comments = reviews_data['comments']
    if nb_analyzer:
        tb = Blobber(analyzer=NaiveBayesAnalyzer())
        for comment in tqdm(comments):
            # BUG FIX: the original tested `i != np.nan`, which is ALWAYS True
            # (NaN never compares equal to anything), so the neutral 0.5
            # fallback was dead code. pd.isna() is the correct missing test.
            scores.append(0.5 if pd.isna(comment) else tb(str(comment)).sentiment.p_pos)
    else:
        for comment in tqdm(comments):
            scores.append(0.5 if pd.isna(comment) else TextBlob(str(comment)).sentiment.subjectivity)
    return scores
def best_worst_listings(data):
    '''
    Build a 2-row plotly figure: the five listings with the highest mean
    weighted NB sentiment on top, the five lowest below.
    '''
    ranked = (
        data.groupby(by="listing_id").mean()['nb_opinion_bins']
        .sort_values(ascending=False)
    )
    limit = 5
    labels = ranked.index.astype('str')
    fig = make_subplots(rows=2, cols=1, subplot_titles=("Best Customer Satisfaction listings", "Worst Customer Satisfaction listings"))
    fig.add_trace(go.Bar(x=labels[:limit], y=ranked.values[:limit]), row=1, col=1)
    fig.add_trace(go.Bar(x=labels[-limit:], y=ranked.values[-limit:]), row=2, col=1)
    return fig
def get_top_reviewers():
    '''
    Rank reviewers by number of reviews written (descending).

    Returns a deduplicated DataFrame with columns:
    reviewer_id, reviewer_name, id_x (raw count), count, id_and_name.
    '''
    # Per-reviewer review counts, most prolific first.
    counts = (
        reviews_data.groupby(by="reviewer_id").count()["id"]
        .reset_index()
        .sort_values(by="id", ascending=False)
    )
    # Re-attach reviewer names; the shared "id" column gets suffixed to id_x.
    details = pd.merge(counts, reviews_data, how="inner", left_on="reviewer_id", right_on="reviewer_id")
    details = details[['reviewer_id', 'reviewer_name', 'id_x']].drop_duplicates()
    details["count"] = details["id_x"].values
    details["id_and_name"] = details['reviewer_id'].astype("str") + " " + details['reviewer_name']
    return details
def get_sentiment_values(reviews_data):
    '''
    Attach 'pattern_opinion' and 'nb_opinion' sentiment columns to reviews_data.

    Reads cached scores from ../Data/sentiment_values.csv when the file exists;
    otherwise computes them with sentiment_list() (slow) and writes the cache
    so subsequent runs are fast. Returns the augmented DataFrame.
    '''
    cache_path = "../Data/sentiment_values.csv"
    if os.path.isfile(cache_path):
        cached = pd.read_csv(cache_path)
        reviews_data["pattern_opinion"] = cached["pattern_opinion"]
        reviews_data["nb_opinion"] = cached["nb_opinion"]
    else:
        reviews_data["pattern_opinion"] = sentiment_list()
        reviews_data["nb_opinion"] = sentiment_list(nb_analyzer=True)
        reviews_data[["pattern_opinion", "nb_opinion"]].to_csv('../Data/sentiment_values.csv',index=False)
    return reviews_data
def plot_top_reviewers_and_opinions(limit=30):
    '''
    Build a 3-row plotly figure for the top `limit` reviewers: their review
    counts, their mean NB (bayesian) opinion, and their mean pattern opinion.
    '''
    # Mean sentiment per reviewer, then joined onto the top-reviewer table.
    mean_opinions = (
        listing_count_reviews_data[["reviewer_id", "pattern_opinion", "nb_opinion"]]
        .groupby(by="reviewer_id").mean().reset_index()
    )
    review_opinion_df = pd.merge(
        get_top_reviewers().iloc[:limit, :],
        mean_opinions,
        left_on="reviewer_id", right_on="reviewer_id",
    )
    labels = review_opinion_df["id_and_name"]
    fig = make_subplots(rows=3, cols=1, subplot_titles=("Best reviewers", "Average pos opinion based on bayesian sentiment analysis", "Average pos opinion based on pattern based sentiment analysis"),)
    fig.add_trace(go.Bar(x=labels[:limit], y=review_opinion_df["count"][:limit]), row=1, col=1)
    fig.add_trace(go.Bar(x=labels.values[:limit], y=review_opinion_df["nb_opinion"].values[:limit]), row=2, col=1)
    fig.add_trace(go.Bar(x=labels.values[:limit], y=review_opinion_df["pattern_opinion"].values[:limit]), row=3, col=1)
    # Hide tick labels on all three x-axes (the id+name strings are too long).
    fig.update_layout(height=700, title = "Reviewers and their opinions",xaxis=dict(tickvals=[]), xaxis2=dict(tickvals=[]), xaxis3=dict(tickvals=[]))
    return fig
def plot_busiest_months():
    '''
    Bar chart of review volume per calendar month — a proxy for which months
    have the highest and lowest bookings. Month is taken from the "MM" slice
    of the ISO date string (positions 5:7).
    '''
    month_counts = reviews_data['date'].str[5:7].value_counts()
    return go.Figure(
        data=[go.Bar(x=month_counts.index, y=month_counts.values)],
        layout_title_text="Busiest Months",
    )
# --- Script driver: load, clean, score sentiment, and build weighted features ---
listings_data, reviews_data, neighborhood_geo_data = import_data("../Data/listings.csv", "../Data/reviews.csv", "../Data/neighbourhoods.geojson")
data_clean()
# Attach (cached, if available) per-review sentiment scores.
reviews_data = get_sentiment_values(reviews_data)
# Join each review with its reviewer's total review count ("count" column).
count_merge_data = pd.merge(reviews_data, get_top_reviewers(), left_on='reviewer_id', right_on='reviewer_id')
# Bucket reviewer activity into three weight tiers (0.3 / 0.5 / 1.0):
# more prolific reviewers get a larger weight.
count_merge_data["count_bins"]=pd.cut(count_merge_data['count'], bins = 3 , labels = [0.3,0.5,1])
count_merge_data["count_bins"] = count_merge_data["count_bins"].astype("float")
# reviews_data.head()
# Weight each sentiment score by the reviewer-activity tier.
count_merge_data["nb_opinion_bins"] = count_merge_data["nb_opinion"]*count_merge_data["count_bins"]
count_merge_data["pattern_opinion_bins"] = count_merge_data["pattern_opinion"]*count_merge_data["count_bins"]
# Bring in each listing's total review count (drop the review "id" to avoid a clash).
listing_count_reviews_data = pd.merge(count_merge_data.drop("id", axis=1), listings_data[["id","number_of_reviews"]], right_on="id", left_on="listing_id")
# listing_count_reviews_data["number_of_reviews"].unique()
# Same 3-tier weighting, but on the listing's review volume.
listing_count_reviews_data["review_bins"]=pd.cut(listing_count_reviews_data['number_of_reviews'], bins = 3 , labels = [0.3,0.5,1])
listing_count_reviews_data["review_bins"] = listing_count_reviews_data["review_bins"].astype("float")
# listing_count_reviews_data.head()
# Render the figures (notebook-style calls; return values are not kept).
plot_busiest_months()
plot_corr_price()
plot_top_reviewers_and_opinions()
best_worst_listings(listing_count_reviews_data)
# NOTE: the lines below are pasted notebook output (the repr of
# listings_with_mean_reviews.columns), not executable Python — commented out
# so the file parses. `listings_with_mean_reviews` is only defined further down.
# listings_with_mean_reviews.columns
# Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'description',
#        'neighborhood_overview', 'picture_url', 'host_id', 'host_url',
#        'host_name', 'host_since', 'host_location', 'host_about',
#        'host_response_time', 'host_response_rate', 'host_acceptance_rate',
#        'host_is_superhost', 'host_thumbnail_url', 'host_picture_url',
#        'host_neighbourhood', 'host_listings_count',
#        'host_total_listings_count', 'host_verifications',
#        'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
#        'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
#        'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms',
#        'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price',
#        'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
#        'maximum_minimum_nights', 'minimum_maximum_nights',
#        'maximum_maximum_nights', 'minimum_nights_avg_ntm',
#        'maximum_nights_avg_ntm', 'calendar_updated', 'has_availability',
#        'availability_30', 'availability_60', 'availability_90',
#        'availability_365', 'calendar_last_scraped', 'number_of_reviews',
#        'number_of_reviews_ltm', 'number_of_reviews_l30d', 'first_review',
#        'last_review', 'review_scores_rating', 'review_scores_accuracy',
#        'review_scores_cleanliness', 'review_scores_checkin',
#        'review_scores_communication', 'review_scores_location',
#        'review_scores_value', 'license', 'instant_bookable',
#        'calculated_host_listings_count',
#        'calculated_host_listings_count_entire_homes',
#        'calculated_host_listings_count_private_rooms',
#        'calculated_host_listings_count_shared_rooms', 'reviews_per_month',
#        'listing_id', 'nb_opinion', 'pattern_opinion'],
#       dtype='object')
# Average the per-review sentiment scores per listing (NB p_pos and pattern
# subjectivity) and join them onto the listings table.
listings_with_mean_reviews = pd.merge(listings_data, listing_count_reviews_data.groupby(by="listing_id").mean()[["nb_opinion","pattern_opinion"]].reset_index(), left_on="id", right_on="listing_id")
# listings_with_mean_reviews.columns
def create_amenities_list(x):
    '''
    Convert a list of matched amenity keywords into a 5-element 0/1 indicator
    vector ordered as [washer, alarm, conditioning, essentials, wifi].

    `x` is the sorted, deduplicated list of regex matches produced by the
    caller (note the leading space in ' washer', matching the regex pattern).
    '''
    # BUG FIX: the original shortcut compared the list itself to the int 5
    # (`x == 5`), which is never True for a list; the intent was clearly
    # "all five amenities found". len(x) == 5 realizes that intent, and is
    # equivalent to falling through the membership checks below.
    if len(x) == 5:
        return [1, 1, 1, 1, 1]
    temp_lst = [0] * 5
    if ' washer' in x: temp_lst[0] = 1
    if 'alarm' in x: temp_lst[1] = 1
    if 'conditioning' in x: temp_lst[2] = 1
    if 'essentials' in x: temp_lst[3] = 1
    if 'wifi' in x: temp_lst[4] = 1
    return temp_lst
# One-hot encode five amenities of interest out of the raw JSON-ish "amenities"
# string: strip brackets/quotes, lower-case, regex-extract the keywords, then
# expand the indicator list into five columns.
# FIX: pass regex=True explicitly — the pattern "\[|\]|\"" only works as a
# regex, and pandas' FutureWarning says the default will flip to regex=False.
listings_with_mean_reviews[['washer', 'alarm', 'conditioning', 'essentials', "wifi"]] = (
    listings_with_mean_reviews["amenities"]
    .str.replace(r"\[|\]|\"", "", regex=True)
    .str.lower()
    .apply(lambda x: sorted(list(set(re.findall(" washer|wifi|alarm|conditioning|essentials", x)))))
    .apply(lambda x: create_amenities_list(x))
    .apply(pd.Series)
)
# listings_with_mean_reviews.head()
# pasted notebook warning output (not code):
# /tmp/ipykernel_4495/4286936106.py:1: FutureWarning: The default value of regex will change from True to False in a future version.
# df = listings_with_mean_reviews["amenities"].str.replace("\[|\]|\"","").str.split(",").explode().str.strip().str.lower().reset_index().drop('index',axis=1)['amenities'].str.split(" ").explode().value_counts().head(50)
# df
def modify_property_type(x):
    '''
    Collapse a lower-cased free-text property description into a coarse
    category. Keywords are tested in priority order and the first match wins;
    the string "None" is returned when nothing matches.
    '''
    keyword_to_category = (
        ("apartment", 'apartment'),
        ("room", 'room'),
        ("entire bed", 'room'),
        ('entire place', 'house'),
        ("condominium", 'condominium'),
        ("suite", 'suite'),
        ("loft", 'loft'),
        ("casa", 'house'),
        ("house", 'house'),
        ('boat', 'boat'),
        ('castle', 'house'),
    )
    for keyword, category in keyword_to_category:
        if keyword in x:
            return category
    return "None"
# Collapse free-text property types into the coarse categories defined above.
listings_with_mean_reviews["property_type"] = listings_with_mean_reviews["property_type"].apply(lambda x: modify_property_type(x.lower()))
# Replace NaN with None frame-wide (turns affected columns into object dtype).
listings_with_mean_reviews = listings_with_mean_reviews.replace(np.nan, None)
# NOTE(review): `[["review_scores_location"]].mean()` yields a one-element
# Series, not a scalar — this fillna probably imputes nothing (the later
# KMeans fit still saw NaNs). Likely intended `["review_scores_location"].mean()`;
# confirm before relying on this column being complete.
listings_with_mean_reviews = listings_with_mean_reviews.fillna({'review_scores_location':listings_with_mean_reviews[["review_scores_location"]].mean()})
from sklearn.cluster import KMeans

# Cluster listings on availability, review activity, sentiment, location
# score, price, and the five amenity indicator columns.
cluster_features = listings_with_mean_reviews[["availability_365", "reviews_per_month", "nb_opinion", 'review_scores_location', "price","washer","conditioning","essentials","alarm","wifi"]].astype("float")
# BUG FIX: the original fit crashed with
#   ValueError: Input contains NaN, infinity or a value too large for dtype('float64')
# because several of these columns still hold missing values (the earlier
# replace(np.nan, None) does not remove them). Impute each column's NaNs with
# the column mean before fitting.
cluster_features = cluster_features.fillna(cluster_features.mean())
kmeans = KMeans(algorithm='elkan').fit(cluster_features)
# pasted notebook traceback (not code), preserved in summary: the KMeans fit
# above raised, via sklearn's check_array -> _assert_all_finite validation,
#   ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
# i.e. the selected feature columns still contained missing values at fit time.
kmeans.get_params()
# --- Choropleth of mean listing price per neighbourhood ---
import json  # NOTE: json is already imported at the top of the file; kept here for cell independence
# FIX: the original `json.load(open(...))` leaked the file handle; the
# context manager closes it deterministically.
with open("../Data/neighbourhoods.geojson") as geojson_file:
    neighborhood_json = json.load(geojson_file)
# Mean price per neighbourhood.
df = listings_data.groupby(by="neighbourhood_cleansed").mean()['price'].reset_index()
# Center the map on the first vertex of the first feature; GeoJSON stores [lon, lat].
center = {'lat':neighborhood_json['features'][0]['geometry']['coordinates'][0][0][0][1],'lon':neighborhood_json['features'][0]['geometry']['coordinates'][0][0][0][0]}
fig = px.choropleth_mapbox(df, geojson=neighborhood_json, locations='neighbourhood_cleansed', color='price',
                featureidkey="properties.neighbourhood",
                mapbox_style="open-street-map",
                zoom=9.5,
                opacity=0.5,
                center=center,
                range_color=(df['price'].min(), df['price'].max()))
fig.update_geos(fitbounds="locations", visible=False)
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig